# Unpack run parameters (read from a `params` list, likely R Markdown params).
output.var = params$output.var
transform.abs = FALSE # hard-coded here, not taken from params
log.pred = params$log.pred
norm.pred = params$norm.pred
eda = params$eda
# algo.* flags presumably toggle the model-fitting sections later in the
# report - confirm against the full document.
algo.forward.caret = params$algo.forward.caret
algo.backward.caret = params$algo.backward.caret
algo.stepwise.caret = params$algo.stepwise.caret
algo.LASSO.caret = params$algo.LASSO.caret
algo.LARS.caret = params$algo.LARS.caret
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 9
## $ output.var : chr "y3"
## $ log.pred : logi FALSE
## $ norm.pred : logi TRUE
## $ eda : logi TRUE
## $ algo.forward.caret : logi FALSE
## $ algo.backward.caret: logi FALSE
## $ algo.stepwise.caret: logi FALSE
## $ algo.LASSO.caret : logi FALSE
## $ algo.LARS.caret : logi FALSE
# Setup Labels
# Build the (possibly transformed) output-variable column name.
# FIX: the original chained `if/else` re-assignments let the norm.pred line
# reset the name back to the raw output.var even when log.pred was TRUE,
# which would make the cube-root step later overwrite the raw label column.
# Sequential overrides keep the intended precedence: norm > log > raw.
#output.var.tr = if (log.pred == TRUE) paste0(output.var,'.log') else output.var
output.var.tr = output.var
if (log.pred == TRUE) output.var.tr = paste0(output.var,'.cuberoot')
if (norm.pred == TRUE) output.var.tr = paste0(output.var,'.bestnorm')
# Load features and labels, join on JobName, and keep complete cases only.
feat = read.csv('../../Data/features_highprec.csv')
labels = read.csv('../../Data/labels.csv')
# Predictor names = every feature column except the JobName identifier.
predictors = names(dplyr::select(feat,-JobName))
data.ori = inner_join(feat,labels,by='JobName')
#data.ori = inner_join(feat,select_at(labels,c('JobName',output.var)),by='JobName')
# Drop any row with at least one NA across features/labels.
cc = complete.cases(data.ori)
data.notComplete = data.ori[! cc,]
data = data.ori[cc,] %>% select_at(c(predictors,output.var,'JobName'))
message('Original cases: ',nrow(data.ori))
## Original cases: 10000
message('Non-Complete cases: ',nrow(data.notComplete))
## Non-Complete cases: 3020
message('Complete cases: ',nrow(data))
## Complete cases: 6980
summary(dplyr::select_at(data,c('JobName',output.var)))
## JobName y3
## Job_00001: 1 Min. : 95.91
## Job_00002: 1 1st Qu.:118.29
## Job_00003: 1 Median :124.03
## Job_00004: 1 Mean :125.40
## Job_00007: 1 3rd Qu.:131.06
## Job_00008: 1 Max. :193.73
## (Other) :6974
The output variable y3 shows right skewness, so we will proceed with a normalizing transformation (cube-root or orderNorm, depending on the parameters above).
# Distribution of the raw output variable: histogram + density, then QQ plot.
df=gather(select_at(data,output.var))
ggplot(df, aes(x=value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density()
#stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
ggplot(gather(select_at(data,output.var)), aes(sample=value)) +
stat_qq() +
facet_wrap(~key, scales = 'free',ncol=4)
# Cube-root transform of the label when log.pred is set (despite the flag's
# name; the log10 version is the commented-out line); otherwise copy the raw
# label into the "transformed" column.
#if(log.pred==TRUE) data[[output.var.tr]] = log(data[[output.var]],10) else
if(log.pred==TRUE) data[[output.var.tr]] = (data[[output.var]])^(1/3) else
data[[output.var.tr]] = data[[output.var]]
# Compare raw vs transformed distributions side by side (histograms + QQ).
df=gather(select_at(data,c(output.var,output.var.tr)))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=2)
ggplot(gather(select_at(data,c(output.var,output.var.tr))), aes(sample=value)) +
stat_qq() +
facet_wrap(~key, scales = 'free',ncol=4)
Normalization of y3 using the bestNormalize package (orderNorm suggested). This is powerful, but I think it goes further than the objective of this project requires.
if (norm.pred == TRUE){
  # bestNormalize picks the best normalizing transformation for the label
  # (orderNorm is the one it typically suggests for this data).
  t=bestNormalize::bestNormalize(data[[output.var]])
  # FIX: a bare `t` inside a braced block does not auto-print; print explicitly.
  print(t)
  # QQ plots before and after the transformation.
  qqnorm(data[[output.var]])
  qqnorm(predict(t))
  # Store the transformed label; `t` is the transformation object used later
  # to invert predictions back to the original scale.
  data[[output.var.tr]] = predict(t)
}
orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentile, which is then mapped to the same percentile of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a uniform distribution
# Engineered features: pairwise ratios, logs, and inverse squares of the
# controlled x-variables. Column-creation order matches the original
# statement order exactly, so the grep-based predictor list downstream
# (and hence the PCA input order) is unchanged.
ratio.pairs = list(c('x2','x1'), c('x6','x5'), c('x9','x7'), c('x10','x8')
                   , c('x14','x12'), c('x15','x13'), c('x17','x16')
                   , c('x19','x18'), c('x21','x20'), c('x23','x22'))
for (p in ratio.pairs) {
  data[[paste0(p[1], 'by', p[2])]] = data[[p[1]]] / data[[p[2]]]
}
# Natural-log transforms (order intentionally mirrors the original script,
# including x9 before x8, x14 before x13, and x11 last).
log.vars = c('x1','x2','x5','x6','x7','x9','x8','x10','x12','x14','x13'
             , 'x15','x16','x17','x18','x19','x20','x21','x22','x23','x11')
for (v in log.vars) {
  data[[paste0(v, 'log')]] = log(data[[v]])
}
# Inverse-square transforms for a subset of the x-variables.
sqinv.vars = c('x1','x5','x7','x8','x12','x13','x16','x18','x20','x22')
for (v in sqinv.vars) {
  data[[paste0(v, 'sqinv')]] = 1 / (data[[v]])^2
}
# Print the original predictor list (before the engineered columns are added).
predictors
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10" "x11"
## [12] "x12" "x13" "x14" "x15" "x16" "x17" "x18" "x19" "x20" "x21" "x22"
## [23] "x23" "stat1" "stat2" "stat3" "stat4" "stat5" "stat6" "stat7" "stat8" "stat9" "stat10"
## [34] "stat11" "stat12" "stat13" "stat14" "stat15" "stat16" "stat17" "stat18" "stat19" "stat20" "stat21"
## [45] "stat22" "stat23" "stat24" "stat25" "stat26" "stat27" "stat28" "stat29" "stat30" "stat31" "stat32"
## [56] "stat33" "stat34" "stat35" "stat36" "stat37" "stat38" "stat39" "stat40" "stat41" "stat42" "stat43"
## [67] "stat44" "stat45" "stat46" "stat47" "stat48" "stat49" "stat50" "stat51" "stat52" "stat53" "stat54"
## [78] "stat55" "stat56" "stat57" "stat58" "stat59" "stat60" "stat61" "stat62" "stat63" "stat64" "stat65"
## [89] "stat66" "stat67" "stat68" "stat69" "stat70" "stat71" "stat72" "stat73" "stat74" "stat75" "stat76"
## [100] "stat77" "stat78" "stat79" "stat80" "stat81" "stat82" "stat83" "stat84" "stat85" "stat86" "stat87"
## [111] "stat88" "stat89" "stat90" "stat91" "stat92" "stat93" "stat94" "stat95" "stat96" "stat97" "stat98"
## [122] "stat99" "stat100" "stat101" "stat102" "stat103" "stat104" "stat105" "stat106" "stat107" "stat108" "stat109"
## [133] "stat110" "stat111" "stat112" "stat113" "stat114" "stat115" "stat116" "stat117" "stat118" "stat119" "stat120"
## [144] "stat121" "stat122" "stat123" "stat124" "stat125" "stat126" "stat127" "stat128" "stat129" "stat130" "stat131"
## [155] "stat132" "stat133" "stat134" "stat135" "stat136" "stat137" "stat138" "stat139" "stat140" "stat141" "stat142"
## [166] "stat143" "stat144" "stat145" "stat146" "stat147" "stat148" "stat149" "stat150" "stat151" "stat152" "stat153"
## [177] "stat154" "stat155" "stat156" "stat157" "stat158" "stat159" "stat160" "stat161" "stat162" "stat163" "stat164"
## [188] "stat165" "stat166" "stat167" "stat168" "stat169" "stat170" "stat171" "stat172" "stat173" "stat174" "stat175"
## [199] "stat176" "stat177" "stat178" "stat179" "stat180" "stat181" "stat182" "stat183" "stat184" "stat185" "stat186"
## [210] "stat187" "stat188" "stat189" "stat190" "stat191" "stat192" "stat193" "stat194" "stat195" "stat196" "stat197"
## [221] "stat198" "stat199" "stat200" "stat201" "stat202" "stat203" "stat204" "stat205" "stat206" "stat207" "stat208"
## [232] "stat209" "stat210" "stat211" "stat212" "stat213" "stat214" "stat215" "stat216" "stat217"
# Rebuild the predictor list after feature engineering: controlled
# variables (names starting with "x", including the engineered transforms)
# plus statistical features (names starting with "stat").
controlled.vars = colnames(data)[grep("^x",colnames(data))]
stat.vars = colnames(data)[grep("^stat",colnames(data))]
predictors = c(controlled.vars,stat.vars)
predictors
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10"
## [11] "x11" "x12" "x13" "x14" "x15" "x16" "x17" "x18" "x19" "x20"
## [21] "x21" "x22" "x23" "x2byx1" "x6byx5" "x9byx7" "x10byx8" "x14byx12" "x15byx13" "x17byx16"
## [31] "x19byx18" "x21byx20" "x23byx22" "x1log" "x2log" "x5log" "x6log" "x7log" "x9log" "x8log"
## [41] "x10log" "x12log" "x14log" "x13log" "x15log" "x16log" "x17log" "x18log" "x19log" "x20log"
## [51] "x21log" "x22log" "x23log" "x11log" "x1sqinv" "x5sqinv" "x7sqinv" "x8sqinv" "x12sqinv" "x13sqinv"
## [61] "x16sqinv" "x18sqinv" "x20sqinv" "x22sqinv" "stat1" "stat2" "stat3" "stat4" "stat5" "stat6"
## [71] "stat7" "stat8" "stat9" "stat10" "stat11" "stat12" "stat13" "stat14" "stat15" "stat16"
## [81] "stat17" "stat18" "stat19" "stat20" "stat21" "stat22" "stat23" "stat24" "stat25" "stat26"
## [91] "stat27" "stat28" "stat29" "stat30" "stat31" "stat32" "stat33" "stat34" "stat35" "stat36"
## [101] "stat37" "stat38" "stat39" "stat40" "stat41" "stat42" "stat43" "stat44" "stat45" "stat46"
## [111] "stat47" "stat48" "stat49" "stat50" "stat51" "stat52" "stat53" "stat54" "stat55" "stat56"
## [121] "stat57" "stat58" "stat59" "stat60" "stat61" "stat62" "stat63" "stat64" "stat65" "stat66"
## [131] "stat67" "stat68" "stat69" "stat70" "stat71" "stat72" "stat73" "stat74" "stat75" "stat76"
## [141] "stat77" "stat78" "stat79" "stat80" "stat81" "stat82" "stat83" "stat84" "stat85" "stat86"
## [151] "stat87" "stat88" "stat89" "stat90" "stat91" "stat92" "stat93" "stat94" "stat95" "stat96"
## [161] "stat97" "stat98" "stat99" "stat100" "stat101" "stat102" "stat103" "stat104" "stat105" "stat106"
## [171] "stat107" "stat108" "stat109" "stat110" "stat111" "stat112" "stat113" "stat114" "stat115" "stat116"
## [181] "stat117" "stat118" "stat119" "stat120" "stat121" "stat122" "stat123" "stat124" "stat125" "stat126"
## [191] "stat127" "stat128" "stat129" "stat130" "stat131" "stat132" "stat133" "stat134" "stat135" "stat136"
## [201] "stat137" "stat138" "stat139" "stat140" "stat141" "stat142" "stat143" "stat144" "stat145" "stat146"
## [211] "stat147" "stat148" "stat149" "stat150" "stat151" "stat152" "stat153" "stat154" "stat155" "stat156"
## [221] "stat157" "stat158" "stat159" "stat160" "stat161" "stat162" "stat163" "stat164" "stat165" "stat166"
## [231] "stat167" "stat168" "stat169" "stat170" "stat171" "stat172" "stat173" "stat174" "stat175" "stat176"
## [241] "stat177" "stat178" "stat179" "stat180" "stat181" "stat182" "stat183" "stat184" "stat185" "stat186"
## [251] "stat187" "stat188" "stat189" "stat190" "stat191" "stat192" "stat193" "stat194" "stat195" "stat196"
## [261] "stat197" "stat198" "stat199" "stat200" "stat201" "stat202" "stat203" "stat204" "stat205" "stat206"
## [271] "stat207" "stat208" "stat209" "stat210" "stat211" "stat212" "stat213" "stat214" "stat215" "stat216"
## [281] "stat217"
All predictors show a fat-tailed situation: the density is concentrated in the two tails, with relatively little mass around the mean. The orderNorm transformation can help (see the [Best Normalizator] section)
Histograms
# EDA: histograms + density for a hand-picked sample of predictors.
if (eda == TRUE){
cols = c('x11','x18','stat98','x7','stat110')
df=gather(select_at(data,cols))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=3)
# ggplot(gather(select_at(data,cols)), aes(sample=value)) +
# stat_qq()+
# facet_wrap(~key, scales = 'free',ncol=2)
# Per-variable summaries; last expression of the block, so it prints.
lapply(select_at(data,cols),summary)
}
## $x11
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.000e-08 9.494e-08 1.001e-07 1.001e-07 1.052e-07 1.100e-07
##
## $x18
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.500 3.147 4.769 4.772 6.418 7.999
##
## $stat98
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.998619 -1.551882 -0.015993 -0.005946 1.528405 2.999499
##
## $x7
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.700 1.266 1.854 1.852 2.446 3.000
##
## $stat110
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.999543 -1.496865 -0.002193 -0.004129 1.504273 2.999563
Scatter plot vs. output variable **y3.bestnorm**
# Scatter plots of the sampled predictors vs the transformed output variable.
if (eda == TRUE){
d = gather(dplyr::select_at(data,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
# NOTE(review): aes_string() is soft-deprecated in newer ggplot2; still works.
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light green',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=3)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
All indicators show strong evidence of fat tails
# Histograms + density for ALL predictors (large facet grid).
if (eda == TRUE){
df=gather(select_at(data,predictors))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=4)
}
# Correlation of every candidate predictor with the transformed label,
# reported as top-positive and top-negative tables.
if (eda == TRUE){
#chart.Correlation(select(data,-JobName), pch=21)
# https://stackoverflow.com/questions/27034655/how-to-use-dplyrarrangedesc-when-using-a-string-as-column-name
t=as.data.frame(round(cor(dplyr::select(data,-one_of(output.var.tr,'JobName'))
,select_at(data,output.var.tr)),4)) %>%
#rownames_to_column(var='variable') %>% filter(variable != !!output.var) %>% arrange(-y3.log)
rownames_to_column(var='variable') %>% filter(variable != !!output.var) %>% arrange(-!!sym(output.var.tr))
#DT::datatable(t)
message("Top Positive")
#kable(head(arrange(t,desc(y3.log)),20))
kable(head(arrange(t,desc(!!sym(output.var.tr))),20))
message("Top Negative")
#kable(head(arrange(t,y3.log),20))
kable(head(arrange(t,!!sym(output.var.tr)),20))
}
## Top Positive
## Top Negative
| variable | y3.bestnorm |
|---|---|
| x18sqinv | -0.3917 |
| x19byx18 | -0.2629 |
| x7sqinv | -0.2297 |
| stat110 | -0.1731 |
| x4 | -0.0621 |
| x16sqinv | -0.0540 |
| x9byx7 | -0.0456 |
| stat41 | -0.0350 |
| stat13 | -0.0338 |
| stat14 | -0.0320 |
| stat113 | -0.0300 |
| stat149 | -0.0293 |
| stat4 | -0.0268 |
| stat146 | -0.0253 |
| stat106 | -0.0231 |
| stat5 | -0.0229 |
| stat128 | -0.0228 |
| x8sqinv | -0.0225 |
| stat91 | -0.0222 |
| stat186 | -0.0214 |
# Pairwise correlation matrix of all variables (first 10x10 shown).
if (eda == TRUE){
#chart.Correlation(select(data,-JobName), pch=21)
t=as.data.frame(round(cor(dplyr::select(data,-one_of('JobName'))),4))
#DT::datatable(t,options=list(scrollX=T))
message("Showing only 10 variables")
kable(t[1:10,1:10])
}
## Showing only 10 variables
| x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| x1 | 1.0000 | 0.0034 | -0.0028 | 0.0085 | 0.0068 | 0.0159 | 0.0264 | -0.0012 | 0.0142 | 0.0013 |
| x2 | 0.0034 | 1.0000 | -0.0057 | 0.0004 | -0.0094 | -0.0101 | 0.0089 | 0.0078 | 0.0049 | -0.0214 |
| x3 | -0.0028 | -0.0057 | 1.0000 | 0.0029 | 0.0046 | 0.0006 | -0.0105 | -0.0002 | 0.0167 | -0.0137 |
| x4 | 0.0085 | 0.0004 | 0.0029 | 1.0000 | -0.0059 | 0.0104 | 0.0098 | 0.0053 | 0.0061 | -0.0023 |
| x5 | 0.0068 | -0.0094 | 0.0046 | -0.0059 | 1.0000 | 0.0016 | -0.0027 | 0.0081 | 0.0259 | -0.0081 |
| x6 | 0.0159 | -0.0101 | 0.0006 | 0.0104 | 0.0016 | 1.0000 | 0.0200 | -0.0157 | 0.0117 | -0.0072 |
| x7 | 0.0264 | 0.0089 | -0.0105 | 0.0098 | -0.0027 | 0.0200 | 1.0000 | -0.0018 | -0.0069 | -0.0221 |
| x8 | -0.0012 | 0.0078 | -0.0002 | 0.0053 | 0.0081 | -0.0157 | -0.0018 | 1.0000 | 0.0142 | -0.0004 |
| x9 | 0.0142 | 0.0049 | 0.0167 | 0.0061 | 0.0259 | 0.0117 | -0.0069 | 0.0142 | 1.0000 | 0.0149 |
| x10 | 0.0013 | -0.0214 | -0.0137 | -0.0023 | -0.0081 | -0.0072 | -0.0221 | -0.0004 | 0.0149 | 1.0000 |
Scatter plots with all predictors and the output variable (y3.bestnorm)
# Scatter plots of ALL predictors vs the transformed output variable.
if (eda == TRUE){
d = gather(dplyr::select_at(data,c(predictors,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light blue',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Multicollinearity among predictors: the engineered x-variable transforms (logs, ratios, inverse squares) are, as expected, highly collinear with their base variables (very large VIFs), while the stat variables show essentially none.
Showing top predictors by VIF value
# Variance Inflation Factors for all predictors, sorted descending
# (top 75 shown); values >> 10 indicate strong multicollinearity.
if (eda == TRUE){
vifDF = usdm::vif(select_at(data,predictors)) %>% arrange(desc(VIF))
head(vifDF,75)
}
## Variables VIF
## 1 x16log 3998.179733
## 2 x5log 2626.801410
## 3 x20log 1897.577768
## 4 x16 1843.299742
## 5 x11 1615.312389
## 6 x11log 1614.766671
## 7 x8log 1592.415543
## 8 x5 1213.925077
## 9 x20 883.283902
## 10 x8 747.717521
## 11 x7log 553.920786
## 12 x1log 535.598348
## 13 x16sqinv 457.897641
## 14 x13log 382.827473
## 15 x12log 365.277231
## 16 x18log 324.148910
## 17 x5sqinv 305.138318
## 18 x7 268.006786
## 19 x1 259.006574
## 20 x20sqinv 223.956024
## 21 x13 188.978165
## 22 x8sqinv 187.400711
## 23 x12 182.770980
## 24 x22log 173.743353
## 25 x18 161.523604
## 26 x22 88.181247
## 27 x7sqinv 67.434027
## 28 x1sqinv 64.903076
## 29 x21 57.388228
## 30 x2 48.202299
## 31 x13sqinv 47.509983
## 32 x21log 46.942688
## 33 x12sqinv 44.623944
## 34 x2log 42.512325
## 35 x18sqinv 40.924482
## 36 x23 38.095266
## 37 x23log 35.906981
## 38 x6 34.899869
## 39 x17 28.998150
## 40 x22sqinv 23.166022
## 41 x21byx20 23.114903
## 42 x6log 22.224931
## 43 x17byx16 22.055566
## 44 x6byx5 20.399989
## 45 x9 20.266476
## 46 x10 19.643009
## 47 x19 19.218389
## 48 x14 17.837002
## 49 x2byx1 17.350882
## 50 x10byx8 15.403096
## 51 x9log 14.663585
## 52 x19log 14.650667
## 53 x15 13.533471
## 54 x14log 12.798027
## 55 x17log 12.432304
## 56 x23byx22 11.837683
## 57 x9byx7 11.250566
## 58 x19byx18 10.642957
## 59 x14byx12 10.342193
## 60 x15byx13 9.467077
## 61 x15log 8.894967
## 62 x10log 8.839701
## 63 stat18 1.072090
## 64 stat150 1.071638
## 65 stat127 1.070999
## 66 stat154 1.069491
## 67 stat66 1.068601
## 68 stat130 1.068523
## 69 stat202 1.068477
## 70 stat156 1.068144
## 71 stat175 1.068102
## 72 stat15 1.068040
## 73 stat14 1.067993
## 74 stat2 1.067787
## 75 stat111 1.067680
# Additional transformation: sqrt of x18, compared against raw x18 below.
data.tr=data %>%
mutate(x18.sqrt = sqrt(x18))
cols=c('x18','x18.sqrt')
# ggplot(gather(select_at(data.tr,cols)), aes(value)) +
# geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
# geom_density() +
# facet_wrap(~key, scales = 'free',ncol=4)
d = gather(dplyr::select_at(data.tr,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light blue',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#removing unwanted variables
# Drop the row identifier before modeling; everything else is kept.
data.tr=data.tr %>%
#dplyr::select_at(names(data.tr)[! names(data.tr) %in% c('x18','y3','JobName')])
dplyr::select_at(names(data.tr)[! names(data.tr) %in% c('JobName')])
data=data.tr
# From here on the modeling label is the transformed output column.
label.names=output.var.tr
# 0 for no interaction,
# 1 for Full 2 way interaction and
# 2 for Selective 2 way interaction
# 3 for Selective 3 way interaction
InteractionMode = 2
# PCA input variables: every column except the (transformed) label.
pca.vars = names(data)
pca.vars = pca.vars[!pca.vars %in% label.names]
# http://sshaikh.org/2015/05/06/parallelize-machine-learning-in-r-with-multi-core-cpus/
# Use ~75% of the cores, leave the rest for other tasks.
# FIX: makeCluster() expects an integer worker count; detectCores()*0.75 is
# fractional, so round down and keep at least one worker.
cl <- makeCluster(max(1, floor(detectCores() * 0.75)))
registerDoParallel(cl)
if(InteractionMode == 1){
  # Full 2-way interactions among ALL PCA input variables.
  pca.formula =as.formula(paste0('~(',paste0(pca.vars, collapse ='+'),')^2'))
  pca.model = prcomp(formula=pca.formula,data=data[,pca.vars],center=TRUE,scale.=TRUE,retx = TRUE)
  #saveRDS(pca.model,'pca.model.rds')
}
if (InteractionMode == 0){
  # No interactions: plain PCA on the raw matrix.
  pca.model = prcomp(x=data[,pca.vars],center=TRUE,scale.=TRUE,retx = TRUE)
}
if (InteractionMode >= 2 && InteractionMode <= 3){
  # Selective interactions: only among the controlled x* variables;
  # the stat* variables enter without interactions.
  controlled.vars = pca.vars[grep("^x",pca.vars)]
  stat.vars = pca.vars[grep("^stat",pca.vars)]
  if (InteractionMode >= 2){
    interaction.form = paste0('~(',paste0(controlled.vars, collapse ='+'),')^2')
  }
  if (InteractionMode >= 3){
    # Mode 3 overrides the 2-way form with 3-way interactions.
    interaction.form = paste0('~(',paste0(controlled.vars, collapse ='+'),')^3')
  }
  no.interact.form = paste0(stat.vars, collapse ='+')
  pca.formula = as.formula(paste(interaction.form, no.interact.form, sep = "+"))
  pca.model = prcomp(formula=pca.formula,data=data[,pca.vars],center=TRUE,scale.=TRUE,retx = TRUE)
}
stopCluster(cl)
registerDoSEQ() # register sequential engine in case you are not using this function anymore
# Select the smallest set of leading PCs explaining targetCumVar of variance.
targetCumVar = .9
pca.model$var = pca.model$sdev ^ 2 #eigenvalues
pca.model$pvar = pca.model$var / sum(pca.model$var) # proportion of variance per PC
pca.model$cumpvar = cumsum(pca.model$pvar )
# Logical mask of PCs kept (cumulative proportion within the target).
pca.model$pcaSel = pca.model$cumpvar<=targetCumVar
pca.model$pcaSelCount = sum(pca.model$pcaSel)
pca.model$pcaSelTotVar = sum(pca.model$pvar[pca.model$pcaSel])
message(pca.model$pcaSelCount, " PCAs justify ",percent(targetCumVar)," of the total Variance. (",percent(pca.model$pcaSelTotVar),")")
## 164 PCAs justify 90.0% of the total Variance. (90.0%)
# Scree plots: per-PC variance and cumulative proportion.
plot(pca.model$var,xlab="Principal component", ylab="Proportion of variance explained", type='b')
plot(cumsum(pca.model$pvar ),xlab="Principal component", ylab="Cumulative Proportion of variance explained", ylim=c(0,1), type='b')
screeplot(pca.model,npcs = pca.model$pcaSelCount)
screeplot(pca.model,npcs = pca.model$pcaSelCount,type='lines')
#summary(pca.model)
#pca.model$rotation
#creating dataset
# Keep the label plus only the selected PCs' scores.
data.pca = dplyr::select(data,!!label.names) %>%
dplyr::bind_cols(dplyr::select(as.data.frame(pca.model$x)
,!!colnames(pca.model$rotation)[pca.model$pcaSel])
)
# NOTE(review): no set.seed before the shuffle/split below, so the
# train/test partition is not reproducible across runs - confirm intended.
data.pca = data.pca[sample(nrow(data.pca)),] # randomly shuffle data
# 80/20 train/test split (caTools::sample.split).
split = sample.split(data.pca[,label.names], SplitRatio = 0.8)
data.train = subset(data.pca, split == TRUE)
data.test = subset(data.pca, split == FALSE)
plot.diagnostics <- function(model, train) {
  # Diagnostic plots for a fitted regression model:
  #   - base plot(model) panels
  #   - studentized and standardized residuals vs predicted values
  #   - histogram of studentized residuals against the standard normal
  #   - leverage and Cook's distance index plots
  # Args:
  #   model: a fitted model accepted by rstandard/rstudent/cooks.distance (e.g. lm)
  #   train: the data used to fit, for predicted values and the 4/n threshold
  # Returns: the vector of Cook's distances.
  r.standard = rstandard(model)
  r.student = rstudent(model)
  plot(model)
  # Hoisted: predicted values are used by the two residual plots below.
  fitted.vals = predict(model, train)
  df = data.frame(x = fitted.vals, y = r.student)
  p = ggplot(data = df, aes(x = x, y = y)) +
    geom_point(color = 'blue', alpha = 0.5, shape = 20, size = 2) +
    geom_hline(yintercept = 0, size = 1) +
    ylab("Student Residuals") +
    xlab("Predicted Values") +
    ggtitle("Student Residual Plot")
  plot(p)
  df = data.frame(x = fitted.vals, y = r.standard)
  p = ggplot(data = df, aes(x = x, y = y)) +
    geom_point(color = 'blue', alpha = 0.5, shape = 20, size = 2) +
    geom_hline(yintercept = c(-2, 0, 2), size = 1) +
    # FIX: these labels said "Student" but this plot shows standardized residuals.
    ylab("Standardized Residuals") +
    xlab("Predicted Values") +
    ggtitle("Standardized Residual Plot")
  plot(p)
  # Histogram of studentized residuals vs the N(0,1) density.
  df = data.frame(r.student)
  p = ggplot(data = df, aes(r.student)) +
    geom_histogram(aes(y = ..density..), bins = 50, fill = 'blue', alpha = 0.6) +
    stat_function(fun = dnorm, n = 100, args = list(mean = 0, sd = 1)) +
    ylab("Density") +
    xlab("Studentized Residuals") +
    ggtitle("Distribution of Studentized Residuals")
  plot(p)
  # http://www.stat.columbia.edu/~martin/W2024/R7.pdf
  # Influence measures are computed but not printed (summary is too verbose).
  inf.meas = influence.measures(model)
  # Leverage plot
  lev = hat(model.matrix(model))
  df = tibble::rownames_to_column(as.data.frame(lev), 'id')
  p = ggplot(data = df, aes(x = as.numeric(id), y = lev)) +
    geom_point(color = 'blue', alpha = 0.5, shape = 20, size = 2) +
    ylab('Leverage - check') +
    xlab('Index')
  plot(p)
  # Cook's Distance: reference line at the 4/n rule of thumb; only the more
  # extreme points (> 15/n) are labeled, to keep the plot readable.
  cd = cooks.distance(model)
  df = tibble::rownames_to_column(as.data.frame(cd), 'id')
  p = ggplot(data = df, aes(x = as.numeric(id), y = cd)) +
    geom_point(color = 'blue', alpha = 0.5, shape = 20, size = 2) +
    geom_text(data = filter(df, cd > 15/nrow(train)), aes(label = id), check_overlap = T, size = 3, vjust = -.5) +
    ylab('Cooks distances') +
    geom_hline(yintercept = c(4/nrow(train), 0), size = 1) +
    xlab('Index')
  plot(p)
  print(paste("Number of data points that have Cook's D > 4/n: ", length(cd[cd > 4/nrow(train)]), sep = ""))
  print(paste("Number of data points that have Cook's D > 1: ", length(cd[cd > 1]), sep = ""))
  return(cd)
}
# function to set up random seeds
# Based on http://jaehyeon-kim.github.io/2015/05/Setup-Random-Seeds-on-Caret-Package.html
setCaretSeeds <- function(method = "cv", numbers = 1, repeats = 1, tunes = NULL, seed = 1701) {
  # Build the `seeds` list expected by caret::trainControl(): one integer
  # vector per resample (length = numbers + tunes), plus a final single
  # seed for fitting the last model. Returns NULL for unsupported methods.
  # B is the number of resamples
  B <- if (method == "cv") numbers
       else if (method == "repeatedcv") numbers * repeats
       else NULL
  # FIX: the original tested is.null(length) - `length` is the base
  # function and is never NULL, so unknown methods crashed inside
  # vector(mode = "list", length = NULL) instead of returning NULL.
  if (is.null(B)) {
    seeds <- NULL
  } else {
    set.seed(seed = seed)
    seeds <- vector(mode = "list", length = B)
    seeds <- lapply(seeds, function(x) sample.int(n = 1000000
                    , size = numbers + ifelse(is.null(tunes), 0, tunes)))
    # Final element: single seed for the last model fit.
    seeds[[length(seeds) + 1]] <- sample.int(n = 1000000, size = 1)
  }
  # return seeds
  seeds
}
train.caret.glmselect = function(formula, data, method
                                 ,subopt = NULL, feature.names
                                 , train.control = NULL, tune.grid = NULL, pre.proc = NULL){
  # Train a feature-selecting regression via caret and report diagnostics.
  #   method: 'leapForward'/'leapBackward'/'leapSeq' (regsubsets),
  #           'glmnet' with subopt == 'LASSO', or 'lars'.
  # Returns a list with the fitted model, the best-model id (leap methods
  # only) and the diagnostic ggplot objects; prints plots as a side effect.
  if(is.null(train.control)){
    # Default: 10-fold CV with fixed seeds for reproducibility.
    train.control <- trainControl(method = "cv"
                                  ,number = 10
                                  ,seeds = setCaretSeeds(method = "cv"
                                                         , numbers = 10
                                                         , seed = 1701)
                                  ,search = "grid"
                                  ,verboseIter = TRUE
                                  ,allowParallel = TRUE
                                  )
  }
  if(is.null(tune.grid)){
    if (method == 'leapForward' || method == 'leapBackward' || method == 'leapSeq'){
      tune.grid = data.frame(nvmax = seq_along(feature.names))
    }
    # FIX: guard against a NULL subopt - the bare `subopt == 'LASSO'`
    # comparison yields a length-zero condition and errors in `if`.
    if (method == 'glmnet' && !is.null(subopt) && subopt == 'LASSO'){
      # Will only show 1 Lambda value during training, but that is OK
      # https://stackoverflow.com/questions/47526544/why-need-to-tune-lambda-with-carettrain-method-glmnet-and-cv-glmnet
      # Another option for LASSO is this: https://github.com/topepo/caret/blob/master/RegressionTests/Code/lasso.R
      lambda = 10^seq(-2,0, length =100)
      alpha = c(1) # alpha = 1 selects the pure LASSO penalty in glmnet
      tune.grid = expand.grid(alpha = alpha,lambda = lambda)
    }
    if (method == 'lars'){
      # https://github.com/topepo/caret/blob/master/RegressionTests/Code/lars.R
      fraction = seq(0, 1, length = 100)
      tune.grid = expand.grid(fraction = fraction)
      pre.proc = c("center", "scale")
    }
  }
  # http://sshaikh.org/2015/05/06/parallelize-machine-learning-in-r-with-multi-core-cpus/
  # Use ~75% of cores. FIX: makeCluster() needs an integer worker count.
  cl <- makeCluster(max(1, floor(detectCores() * 0.75)))
  registerDoParallel(cl)
  set.seed(1)
  # note that the seed has to actually be set just before train() is called;
  # setting it earlier does not ensure reproducibility
  model.caret <- caret::train(formula
                              , data = data
                              , method = method
                              , tuneGrid = tune.grid
                              , trControl = train.control
                              , preProc = pre.proc
                              )
  stopCluster(cl)
  registerDoSEQ() # register sequential engine in case you are not using this function anymore
  if (method == 'leapForward' || method == 'leapBackward' || method == 'leapSeq'){
    print("All models results")
    print(model.caret$results) # all model results
    print("Best Model")
    print(model.caret$bestTune) # best model
    model = model.caret$finalModel
    # Metrics Plot
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-nvmax) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=nvmax,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    # Residuals Plot
    # leap function does not support studentized residuals
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                    , sd = sd(dataPlot$res)),color='lightblue4') +
      theme_light() # FIX: '+' was missing, so theme_light() was a silent no-op
    plot(residHistogram)
    id = rownames(model.caret$bestTune)
    # Provides the coefficients of the best model.
    # regsubsets doesn't return a full model (see regsubsets documentation),
    # so we refit an lm on the selected terms to get coefficient intervals.
    # https://stackoverflow.com/questions/13063762/how-to-obtain-a-lm-object-from-regsubsets
    print("Coefficients of final model:")
    coefs <- coef(model, id=id)
    nams <- names(coefs)
    nams <- nams[!nams %in% "(Intercept)"]
    response <- as.character(formula[[2]])
    form <- as.formula(paste(response, paste(nams, collapse = " + "), sep = " ~ "))
    mod <- lm(form, data = data)
    print(car::Confint(mod))
    return(list(model = model,id = id, residPlot = residPlot, residHistogram=residHistogram
                ,modelLM=mod))
  }
  if (method == 'glmnet' && !is.null(subopt) && subopt == 'LASSO'){
    print(model.caret)
    print(plot(model.caret))
    print(model.caret$bestTune)
    print(model.caret$results)
    model=model.caret$finalModel
    # Metrics Plot
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-lambda) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=lambda,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    # Residuals Plot
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                    , sd = sd(dataPlot$res)),color='lightblue4') +
      theme_light() # FIX: '+' was missing, so theme_light() was a silent no-op
    plot(residHistogram)
    print("Coefficients")
    # no interval for glmnet: https://stackoverflow.com/questions/39750965/confidence-intervals-for-ridge-regression
    t=coef(model,s=model.caret$bestTune$lambda)
    model.coef = t[which(t[,1]!=0),]  # keep only non-zero (selected) coefficients
    print(as.data.frame(model.coef))
    id = NULL # not really needed but added for consistency
    return(list(model = model.caret,id = id, residPlot = residPlot, metricsPlot=metricsPlot ))
  }
  if (method == 'lars'){
    print(model.caret)
    print(plot(model.caret))
    print(model.caret$bestTune)
    # Metrics Plot
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-fraction) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=fraction,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    # Residuals Plot
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                    , sd = sd(dataPlot$res)),color='lightblue4') +
      theme_light() # FIX: '+' was missing, so theme_light() was a silent no-op
    plot(residHistogram)
    print("Coefficients")
    t=coef(model.caret$finalModel,s=model.caret$bestTune$fraction,mode='fraction')
    model.coef = t[which(t!=0)]  # keep only non-zero (selected) coefficients
    print(model.coef)
    id = NULL # not really needed but added for consistency
    return(list(model = model.caret,id = id, residPlot = residPlot, residHistogram=residHistogram))
  }
}
# https://stackoverflow.com/questions/48265743/linear-model-subset-selection-goodness-of-fit-with-k-fold-cross-validation
# changed slightly since call[[2]] was just returning "formula" without actually returning the value in formula
predict.regsubsets <- function(object, newdata, id, formula, ...) {
  # Predict from a regsubsets fit: build the design matrix from `formula`
  # (adds the intercept and expands interaction terms), then multiply only
  # the columns belonging to the size-`id` model by its coefficients.
  design.mat <- model.matrix(formula, newdata)
  best.coefs <- coef(object, id = id)
  selected <- names(best.coefs)
  design.mat[, selected] %*% best.coefs
}
test.model = function(model, test, level=0.95
                      ,draw.limits = FALSE, good = 0.1, ok = 0.15
                      ,method = NULL, subopt = NULL
                      ,id = NULL, formula, feature.names, label.names
                      ,transformation = NULL){
  # Evaluate a fitted model on `test`: prints a prediction summary and test
  # MSE/RMSE, shows a transformed-scale comparison when applicable, and
  # returns an Actual-vs-Predicted plot with +/-good and +/-ok tolerance lines.
  # NOTE(review): reads globals log.pred / norm.pred for the back-transform
  # choice - confirm they are set at the call site. `draw.limits` is kept
  # for interface compatibility but is currently unused.
  ## if using caret for glm select equivalent functionality,
  ## need to pass formula (full is ok as it will select subset of variables from there)
  # FIX: the original used independent `if (method == ...)` tests after the
  # is.null() check; those error with a length-zero condition when method
  # is NULL. An else-if chain dispatches safely.
  if (is.null(method)){
    pred = predict(model, newdata=test, interval="confidence", level = level)
  } else if (method == 'leapForward' || method == 'leapBackward' || method == 'leapSeq'){
    pred = predict.regsubsets(model, newdata = test, id = id, formula = formula)
  } else if (method == 'glmnet' && !is.null(subopt) && subopt == 'LASSO'){
    xtest = as.matrix(test[,feature.names])
    pred=as.data.frame(predict(model, xtest))
  } else if (method == 'lars'){
    pred=as.data.frame(predict(model, newdata = test))
  }
  # Summary of predicted values
  print ("Summary of predicted values: ")
  print(summary(pred[,1]))
  test.mse = mean((test[,label.names]-pred[,1])^2)
  print (paste(method, subopt, "Test MSE:", test.mse, sep=" "))
  test.rmse = sqrt(test.mse)
  print (paste(method, subopt, "Test RMSE:", test.rmse, sep=" "))
  if(log.pred == TRUE || norm.pred == TRUE){
    # plot transformed-scale comparison first.
    # FIX: inside a function a ggplot must be print()ed to render; the
    # original built this plot and silently discarded it.
    df=data.frame(x=test[,label.names],y=pred[,1])
    print(ggplot(df,aes(x=x,y=y)) +
      geom_point(color='blue',alpha=0.5,shape=20,size=2) +
      geom_abline(slope=1,intercept=0,color='black',size=1) +
      #scale_y_continuous(limits=c(min(df),max(df)))+
      xlab("Actual (Transformed)")+
      ylab("Predicted (Transformed)"))
  }
  # Back-transform to the original scale for the final comparison plot.
  if (log.pred == FALSE && norm.pred == FALSE){
    x = test[,label.names]
    y = pred[,1]
  }
  if (log.pred == TRUE){
    # FIX: the live transformation in this report is a cube root (see the
    # EDA section), so invert by cubing; the previous 10^ inverse matched
    # the older, commented-out log10 transform.
    x = test[,label.names]^3
    y = pred[,1]^3
  }
  if (norm.pred == TRUE){
    # Invert the bestNormalize transformation.
    x = predict(transformation, test[,label.names], inverse = TRUE)
    y = predict(transformation, pred[,1], inverse = TRUE)
  }
  df=data.frame(x,y)
  # Returned (auto-printed at top level): actual vs predicted with
  # +/-good (green) and +/-ok (red) tolerance lines through the origin.
  ggplot(df,aes(x,y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_abline(slope=c(1+good,1-good,1+ok,1-ok)
                ,intercept=rep(0,4),color=c('dark green','dark green','dark red','dark red'),size=1,alpha=0.8) +
    #scale_y_continuous(limits=c(min(df),max(df)))+
    xlab("Actual")+
    ylab("Predicted")
}
n <- names(data.train)
# Full model: response columns ~ every non-label training column.
# Null model: response ~ intercept only (grand mean baseline).
response.part <- paste(n[n %in% label.names], collapse = " + ")
predictor.part <- paste(n[!n %in% label.names], collapse = " + ")
formula <- as.formula(paste(response.part, "~", predictor.part))
grand.mean.formula <- as.formula(paste(response.part, "~ 1"))
print(formula)
## y3.bestnorm ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 +
## PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 +
## PC18 + PC19 + PC20 + PC21 + PC22 + PC23 + PC24 + PC25 + PC26 +
## PC27 + PC28 + PC29 + PC30 + PC31 + PC32 + PC33 + PC34 + PC35 +
## PC36 + PC37 + PC38 + PC39 + PC40 + PC41 + PC42 + PC43 + PC44 +
## PC45 + PC46 + PC47 + PC48 + PC49 + PC50 + PC51 + PC52 + PC53 +
## PC54 + PC55 + PC56 + PC57 + PC58 + PC59 + PC60 + PC61 + PC62 +
## PC63 + PC64 + PC65 + PC66 + PC67 + PC68 + PC69 + PC70 + PC71 +
## PC72 + PC73 + PC74 + PC75 + PC76 + PC77 + PC78 + PC79 + PC80 +
## PC81 + PC82 + PC83 + PC84 + PC85 + PC86 + PC87 + PC88 + PC89 +
## PC90 + PC91 + PC92 + PC93 + PC94 + PC95 + PC96 + PC97 + PC98 +
## PC99 + PC100 + PC101 + PC102 + PC103 + PC104 + PC105 + PC106 +
## PC107 + PC108 + PC109 + PC110 + PC111 + PC112 + PC113 + PC114 +
## PC115 + PC116 + PC117 + PC118 + PC119 + PC120 + PC121 + PC122 +
## PC123 + PC124 + PC125 + PC126 + PC127 + PC128 + PC129 + PC130 +
## PC131 + PC132 + PC133 + PC134 + PC135 + PC136 + PC137 + PC138 +
## PC139 + PC140 + PC141 + PC142 + PC143 + PC144 + PC145 + PC146 +
## PC147 + PC148 + PC149 + PC150 + PC151 + PC152 + PC153 + PC154 +
## PC155 + PC156 + PC157 + PC158 + PC159 + PC160 + PC161 + PC162 +
## PC163 + PC164
print(grand.mean.formula)
## y3.bestnorm ~ 1
# Update feature.names because we may have transformed some features
# (take every training column that is not a label).
feature.names = n[!n %in% label.names]
# Fit the full OLS model: response regressed on all principal components.
model.full = lm(formula , data.train)
summary(model.full)
##
## Call:
## lm(formula = formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3584 -0.6062 -0.1077 0.5307 3.6869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.315e-03 1.135e-02 -0.556 0.578101
## PC1 -1.442e-02 9.959e-04 -14.483 < 2e-16 ***
## PC2 -2.725e-02 1.005e-03 -27.124 < 2e-16 ***
## PC3 -1.228e-02 1.010e-03 -12.159 < 2e-16 ***
## PC4 -1.033e-02 1.029e-03 -10.044 < 2e-16 ***
## PC5 5.074e-03 1.055e-03 4.811 1.54e-06 ***
## PC6 -2.569e-03 1.058e-03 -2.428 0.015196 *
## PC7 -4.412e-03 1.085e-03 -4.065 4.86e-05 ***
## PC8 -5.564e-04 1.105e-03 -0.504 0.614617
## PC9 -5.851e-04 1.136e-03 -0.515 0.606598
## PC10 -6.670e-04 1.144e-03 -0.583 0.559924
## PC11 -1.656e-02 1.227e-03 -13.497 < 2e-16 ***
## PC12 -1.335e-02 1.312e-03 -10.178 < 2e-16 ***
## PC13 7.690e-03 1.314e-03 5.851 5.16e-09 ***
## PC14 6.246e-03 1.370e-03 4.560 5.24e-06 ***
## PC15 -2.031e-03 1.392e-03 -1.459 0.144661
## PC16 1.094e-02 1.400e-03 7.816 6.51e-15 ***
## PC17 -3.722e-03 1.492e-03 -2.494 0.012666 *
## PC18 -1.117e-02 1.538e-03 -7.264 4.31e-13 ***
## PC19 1.893e-03 1.575e-03 1.202 0.229550
## PC20 1.159e-02 1.696e-03 6.835 9.08e-12 ***
## PC21 1.880e-03 1.787e-03 1.052 0.293004
## PC22 7.332e-04 2.761e-03 0.266 0.790567
## PC23 1.023e-02 3.442e-03 2.971 0.002985 **
## PC24 -2.562e-02 3.996e-03 -6.412 1.56e-10 ***
## PC25 8.260e-03 4.500e-03 1.835 0.066502 .
## PC26 1.200e-02 4.626e-03 2.594 0.009523 **
## PC27 1.432e-05 4.676e-03 0.003 0.997557
## PC28 4.507e-03 4.749e-03 0.949 0.342670
## PC29 1.076e-02 5.160e-03 2.085 0.037098 *
## PC30 5.177e-03 5.275e-03 0.982 0.326347
## PC31 -7.974e-03 5.703e-03 -1.398 0.162112
## PC32 -2.221e-02 5.742e-03 -3.868 0.000111 ***
## PC33 8.458e-03 5.879e-03 1.439 0.150296
## PC34 3.612e-02 6.183e-03 5.841 5.49e-09 ***
## PC35 -1.844e-03 6.729e-03 -0.274 0.784052
## PC36 4.558e-04 6.724e-03 0.068 0.945962
## PC37 -1.051e-02 6.893e-03 -1.524 0.127443
## PC38 6.729e-03 7.196e-03 0.935 0.349765
## PC39 3.158e-03 7.540e-03 0.419 0.675349
## PC40 -3.590e-03 7.439e-03 -0.483 0.629418
## PC41 -3.409e-03 7.580e-03 -0.450 0.652896
## PC42 2.905e-04 7.704e-03 0.038 0.969921
## PC43 1.343e-03 7.689e-03 0.175 0.861400
## PC44 1.407e-02 7.782e-03 1.808 0.070665 .
## PC45 9.475e-03 7.798e-03 1.215 0.224414
## PC46 6.793e-03 7.897e-03 0.860 0.389719
## PC47 -2.782e-02 7.841e-03 -3.547 0.000392 ***
## PC48 7.609e-03 7.980e-03 0.953 0.340418
## PC49 5.595e-03 7.964e-03 0.703 0.482346
## PC50 2.710e-03 8.090e-03 0.335 0.737604
## PC51 1.327e-03 8.189e-03 0.162 0.871260
## PC52 -1.337e-02 8.228e-03 -1.625 0.104238
## PC53 1.105e-02 8.194e-03 1.348 0.177686
## PC54 -5.839e-03 8.298e-03 -0.704 0.481693
## PC55 -3.546e-03 8.279e-03 -0.428 0.668451
## PC56 5.183e-03 8.361e-03 0.620 0.535362
## PC57 -1.816e-02 8.417e-03 -2.158 0.030968 *
## PC58 -4.693e-03 8.407e-03 -0.558 0.576742
## PC59 2.231e-02 8.534e-03 2.614 0.008972 **
## PC60 -5.522e-03 8.495e-03 -0.650 0.515683
## PC61 -3.432e-03 8.663e-03 -0.396 0.691963
## PC62 -4.361e-03 8.673e-03 -0.503 0.615128
## PC63 -1.834e-02 8.759e-03 -2.094 0.036292 *
## PC64 -2.083e-02 8.684e-03 -2.399 0.016491 *
## PC65 -1.104e-02 8.767e-03 -1.259 0.207929
## PC66 -5.997e-03 8.872e-03 -0.676 0.499105
## PC67 1.175e-02 8.763e-03 1.341 0.179944
## PC68 2.346e-02 8.782e-03 2.672 0.007573 **
## PC69 1.512e-02 8.918e-03 1.695 0.090147 .
## PC70 3.415e-03 8.893e-03 0.384 0.700998
## PC71 2.207e-02 8.851e-03 2.493 0.012696 *
## PC72 -4.681e-03 8.965e-03 -0.522 0.601607
## PC73 5.784e-03 9.028e-03 0.641 0.521766
## PC74 -1.319e-02 8.998e-03 -1.466 0.142702
## PC75 -2.295e-02 9.053e-03 -2.535 0.011285 *
## PC76 -4.076e-04 9.112e-03 -0.045 0.964324
## PC77 1.166e-02 9.132e-03 1.277 0.201796
## PC78 7.304e-03 9.211e-03 0.793 0.427826
## PC79 1.904e-02 9.239e-03 2.061 0.039374 *
## PC80 -1.818e-02 9.270e-03 -1.962 0.049862 *
## PC81 2.037e-02 9.319e-03 2.186 0.028857 *
## PC82 8.293e-03 9.375e-03 0.885 0.376403
## PC83 -4.149e-02 9.560e-03 -4.340 1.45e-05 ***
## PC84 2.750e-02 9.448e-03 2.911 0.003621 **
## PC85 2.957e-02 9.425e-03 3.138 0.001713 **
## PC86 -4.936e-03 9.503e-03 -0.519 0.603462
## PC87 5.437e-02 9.545e-03 5.696 1.29e-08 ***
## PC88 -2.783e-02 9.604e-03 -2.898 0.003773 **
## PC89 -1.848e-02 9.548e-03 -1.935 0.053054 .
## PC90 -1.905e-02 9.569e-03 -1.990 0.046586 *
## PC91 7.521e-03 9.603e-03 0.783 0.433562
## PC92 -3.611e-03 9.715e-03 -0.372 0.710174
## PC93 3.720e-03 9.727e-03 0.382 0.702133
## PC94 -2.196e-02 9.724e-03 -2.259 0.023938 *
## PC95 4.531e-03 9.751e-03 0.465 0.642183
## PC96 -1.789e-02 9.787e-03 -1.828 0.067640 .
## PC97 -7.830e-03 9.837e-03 -0.796 0.426082
## PC98 -6.208e-03 9.767e-03 -0.636 0.525079
## PC99 -2.079e-02 9.916e-03 -2.097 0.036065 *
## PC100 1.127e-04 9.905e-03 0.011 0.990924
## PC101 -5.848e-04 9.864e-03 -0.059 0.952729
## PC102 -1.876e-02 9.831e-03 -1.908 0.056381 .
## PC103 6.854e-03 9.898e-03 0.692 0.488660
## PC104 -2.047e-02 9.893e-03 -2.069 0.038619 *
## PC105 2.156e-02 9.936e-03 2.170 0.030069 *
## PC106 3.004e-02 9.938e-03 3.023 0.002514 **
## PC107 1.079e-02 9.927e-03 1.086 0.277345
## PC108 -5.820e-03 1.010e-02 -0.576 0.564601
## PC109 9.353e-03 1.003e-02 0.932 0.351356
## PC110 -2.737e-03 9.978e-03 -0.274 0.783883
## PC111 -1.887e-02 1.004e-02 -1.880 0.060131 .
## PC112 -4.413e-03 1.008e-02 -0.438 0.661590
## PC113 1.702e-02 1.008e-02 1.688 0.091498 .
## PC114 -2.018e-02 1.003e-02 -2.012 0.044221 *
## PC115 -5.565e-02 1.008e-02 -5.520 3.55e-08 ***
## PC116 -4.920e-03 1.008e-02 -0.488 0.625480
## PC117 -2.566e-03 1.011e-02 -0.254 0.799648
## PC118 7.161e-03 1.010e-02 0.709 0.478496
## PC119 -1.628e-02 1.008e-02 -1.615 0.106468
## PC120 1.702e-04 1.020e-02 0.017 0.986689
## PC121 -1.382e-02 1.015e-02 -1.362 0.173320
## PC122 2.326e-02 1.024e-02 2.271 0.023178 *
## PC123 -2.350e-02 1.024e-02 -2.295 0.021759 *
## PC124 6.562e-03 1.024e-02 0.641 0.521780
## PC125 1.154e-02 1.029e-02 1.121 0.262205
## PC126 3.730e-03 1.024e-02 0.364 0.715755
## PC127 8.970e-03 1.030e-02 0.871 0.383770
## PC128 -2.080e-02 1.040e-02 -2.001 0.045486 *
## PC129 5.160e-03 1.030e-02 0.501 0.616536
## PC130 9.797e-03 1.024e-02 0.957 0.338578
## PC131 -3.032e-02 1.024e-02 -2.961 0.003077 **
## PC132 -4.887e-03 1.038e-02 -0.471 0.637691
## PC133 -9.023e-03 1.032e-02 -0.874 0.381996
## PC134 2.785e-02 1.032e-02 2.697 0.007012 **
## PC135 5.914e-03 1.039e-02 0.569 0.569217
## PC136 1.123e-02 1.034e-02 1.085 0.277767
## PC137 -5.657e-03 1.045e-02 -0.542 0.588135
## PC138 1.908e-02 1.037e-02 1.841 0.065693 .
## PC139 -2.625e-02 1.042e-02 -2.519 0.011814 *
## PC140 -2.313e-02 1.046e-02 -2.212 0.027041 *
## PC141 3.776e-03 1.046e-02 0.361 0.717990
## PC142 2.317e-03 1.047e-02 0.221 0.824930
## PC143 1.980e-02 1.049e-02 1.887 0.059235 .
## PC144 2.208e-02 1.055e-02 2.093 0.036373 *
## PC145 1.088e-02 1.053e-02 1.033 0.301774
## PC146 4.552e-02 1.045e-02 4.356 1.35e-05 ***
## PC147 -1.060e-02 1.059e-02 -1.002 0.316557
## PC148 -3.336e-03 1.048e-02 -0.318 0.750301
## PC149 7.187e-03 1.060e-02 0.678 0.497898
## PC150 7.061e-03 1.055e-02 0.669 0.503536
## PC151 1.959e-02 1.055e-02 1.857 0.063361 .
## PC152 -2.294e-02 1.065e-02 -2.155 0.031231 *
## PC153 2.530e-02 1.060e-02 2.388 0.016997 *
## PC154 -1.541e-02 1.058e-02 -1.457 0.145217
## PC155 2.083e-02 1.066e-02 1.954 0.050730 .
## PC156 2.430e-02 1.069e-02 2.274 0.023035 *
## PC157 -2.769e-04 1.061e-02 -0.026 0.979179
## PC158 -4.518e-03 1.073e-02 -0.421 0.673731
## PC159 4.899e-02 1.079e-02 4.542 5.69e-06 ***
## PC160 7.488e-03 1.065e-02 0.703 0.481850
## PC161 7.412e-04 1.073e-02 0.069 0.944932
## PC162 -4.041e-02 1.074e-02 -3.761 0.000171 ***
## PC163 1.808e-02 1.070e-02 1.690 0.091079 .
## PC164 8.018e-03 1.072e-02 0.748 0.454407
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8463 on 5419 degrees of freedom
## Multiple R-squared: 0.2998, Adjusted R-squared: 0.2786
## F-statistic: 14.15 on 164 and 5419 DF, p-value: < 2.2e-16
cd.full = plot.diagnostics(model=model.full, train=data.train)
## [1] "Number of data points that have Cook's D > 4/n: 286"
## [1] "Number of data points that have Cook's D > 1: 0"
# Influential points by the common 4/n Cook's distance cutoff.
high.cd = names(cd.full[cd.full > 4/nrow(data.train)])
#save dataset with high.cd flagged
# BUG FIX: this temporary was originally assigned to `t`, which clobbered the
# bestNormalize transformation object later passed to test.model() as
# `transformation = t` (and also shadowed base::t). Renamed.
data.high.cd.flag = data.train %>%
rownames_to_column() %>%
mutate(high.cd = ifelse(rowname %in% high.cd,1,0))
#write.csv(data.high.cd.flag,file='data_high_cd_flag.csv',row.names = F)
###
# Refit the full model with the influential rows removed.
data.train2 = data.train[!(rownames(data.train)) %in% high.cd,]
model.full2 = lm(formula , data.train2)
summary(model.full2)
##
## Call:
## lm(formula = formula, data = data.train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.72428 -0.54378 -0.08082 0.51992 2.18917
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.228e-02 1.006e-02 -7.184 7.75e-13 ***
## PC1 -1.532e-02 9.080e-04 -16.876 < 2e-16 ***
## PC2 -2.823e-02 9.040e-04 -31.229 < 2e-16 ***
## PC3 -1.258e-02 9.015e-04 -13.958 < 2e-16 ***
## PC4 -1.206e-02 9.179e-04 -13.136 < 2e-16 ***
## PC5 5.074e-03 9.389e-04 5.405 6.78e-08 ***
## PC6 -2.238e-03 9.388e-04 -2.384 0.017171 *
## PC7 -4.551e-03 9.646e-04 -4.718 2.44e-06 ***
## PC8 -1.147e-03 9.813e-04 -1.169 0.242433
## PC9 1.306e-04 1.011e-03 0.129 0.897182
## PC10 -3.544e-04 1.017e-03 -0.349 0.727439
## PC11 -1.767e-02 1.089e-03 -16.220 < 2e-16 ***
## PC12 -1.398e-02 1.161e-03 -12.040 < 2e-16 ***
## PC13 7.724e-03 1.167e-03 6.618 4.02e-11 ***
## PC14 6.906e-03 1.216e-03 5.678 1.43e-08 ***
## PC15 -1.706e-03 1.235e-03 -1.381 0.167449
## PC16 1.105e-02 1.240e-03 8.908 < 2e-16 ***
## PC17 -4.064e-03 1.322e-03 -3.075 0.002119 **
## PC18 -1.071e-02 1.365e-03 -7.844 5.26e-15 ***
## PC19 2.750e-03 1.394e-03 1.973 0.048566 *
## PC20 1.294e-02 1.501e-03 8.621 < 2e-16 ***
## PC21 1.354e-03 1.581e-03 0.856 0.391996
## PC22 1.457e-03 2.439e-03 0.598 0.550181
## PC23 1.003e-02 3.108e-03 3.228 0.001254 **
## PC24 -3.063e-02 3.588e-03 -8.539 < 2e-16 ***
## PC25 7.460e-03 4.023e-03 1.854 0.063759 .
## PC26 1.204e-02 4.130e-03 2.916 0.003559 **
## PC27 8.563e-05 4.169e-03 0.021 0.983615
## PC28 3.105e-03 4.253e-03 0.730 0.465379
## PC29 7.652e-03 4.566e-03 1.676 0.093784 .
## PC30 5.468e-03 4.720e-03 1.158 0.246755
## PC31 -9.105e-03 5.115e-03 -1.780 0.075117 .
## PC32 -2.453e-02 5.098e-03 -4.811 1.55e-06 ***
## PC33 1.709e-03 5.320e-03 0.321 0.748048
## PC34 3.717e-02 5.540e-03 6.709 2.17e-11 ***
## PC35 5.929e-03 6.089e-03 0.974 0.330265
## PC36 -6.969e-04 6.009e-03 -0.116 0.907685
## PC37 -1.383e-02 6.120e-03 -2.259 0.023915 *
## PC38 8.195e-03 6.368e-03 1.287 0.198204
## PC39 2.505e-03 7.122e-03 0.352 0.725053
## PC40 -2.872e-03 6.740e-03 -0.426 0.669991
## PC41 -3.902e-03 6.836e-03 -0.571 0.568133
## PC42 3.259e-03 6.906e-03 0.472 0.637035
## PC43 5.384e-03 6.921e-03 0.778 0.436679
## PC44 1.114e-02 7.155e-03 1.556 0.119660
## PC45 8.533e-03 7.006e-03 1.218 0.223282
## PC46 4.714e-03 7.060e-03 0.668 0.504370
## PC47 -2.454e-02 7.099e-03 -3.457 0.000550 ***
## PC48 8.159e-03 7.146e-03 1.142 0.253629
## PC49 7.807e-03 7.175e-03 1.088 0.276598
## PC50 -2.654e-03 7.313e-03 -0.363 0.716675
## PC51 1.029e-03 7.363e-03 0.140 0.888915
## PC52 -1.588e-02 7.491e-03 -2.120 0.034043 *
## PC53 2.159e-02 7.385e-03 2.923 0.003482 **
## PC54 -9.829e-03 7.514e-03 -1.308 0.190913
## PC55 -7.491e-03 7.423e-03 -1.009 0.312986
## PC56 2.952e-03 7.589e-03 0.389 0.697270
## PC57 -2.295e-02 7.560e-03 -3.036 0.002412 **
## PC58 -1.503e-02 7.616e-03 -1.974 0.048464 *
## PC59 2.805e-02 7.707e-03 3.640 0.000275 ***
## PC60 -9.201e-03 7.657e-03 -1.202 0.229575
## PC61 -5.967e-03 7.765e-03 -0.769 0.442203
## PC62 6.498e-04 7.841e-03 0.083 0.933954
## PC63 -1.061e-02 7.868e-03 -1.349 0.177450
## PC64 -2.471e-02 7.801e-03 -3.168 0.001546 **
## PC65 -1.703e-02 7.931e-03 -2.147 0.031807 *
## PC66 -8.286e-03 8.041e-03 -1.030 0.302847
## PC67 8.677e-03 7.888e-03 1.100 0.271343
## PC68 2.412e-02 7.933e-03 3.041 0.002369 **
## PC69 1.933e-02 8.015e-03 2.412 0.015889 *
## PC70 9.440e-03 7.946e-03 1.188 0.234876
## PC71 1.284e-02 7.943e-03 1.616 0.106088
## PC72 -2.417e-03 8.034e-03 -0.301 0.763543
## PC73 2.898e-03 8.058e-03 0.360 0.719088
## PC74 -1.422e-03 8.058e-03 -0.177 0.859898
## PC75 -2.283e-02 8.094e-03 -2.820 0.004819 **
## PC76 -6.063e-03 8.162e-03 -0.743 0.457572
## PC77 1.238e-02 8.103e-03 1.528 0.126558
## PC78 2.186e-03 8.219e-03 0.266 0.790259
## PC79 2.223e-02 8.266e-03 2.689 0.007188 **
## PC80 -1.406e-02 8.228e-03 -1.709 0.087486 .
## PC81 2.225e-02 8.320e-03 2.674 0.007519 **
## PC82 1.005e-03 8.369e-03 0.120 0.904377
## PC83 -4.311e-02 8.556e-03 -5.038 4.86e-07 ***
## PC84 2.437e-02 8.465e-03 2.879 0.004008 **
## PC85 3.344e-02 8.424e-03 3.970 7.30e-05 ***
## PC86 7.561e-04 8.498e-03 0.089 0.929113
## PC87 4.751e-02 8.470e-03 5.608 2.15e-08 ***
## PC88 -2.448e-02 8.571e-03 -2.856 0.004303 **
## PC89 -1.226e-02 8.521e-03 -1.439 0.150131
## PC90 -2.074e-02 8.525e-03 -2.433 0.015019 *
## PC91 4.590e-04 8.571e-03 0.054 0.957297
## PC92 2.681e-03 8.646e-03 0.310 0.756514
## PC93 -6.473e-03 8.680e-03 -0.746 0.455833
## PC94 -1.800e-02 8.638e-03 -2.084 0.037181 *
## PC95 -1.733e-03 8.729e-03 -0.199 0.842619
## PC96 -1.615e-02 8.708e-03 -1.854 0.063782 .
## PC97 -8.392e-03 8.730e-03 -0.961 0.336470
## PC98 -1.755e-02 8.677e-03 -2.023 0.043146 *
## PC99 -1.485e-02 8.802e-03 -1.687 0.091615 .
## PC100 -3.439e-03 8.781e-03 -0.392 0.695378
## PC101 -1.106e-02 8.768e-03 -1.261 0.207371
## PC102 -1.673e-02 8.708e-03 -1.921 0.054802 .
## PC103 5.411e-03 8.779e-03 0.616 0.537709
## PC104 -1.997e-02 8.799e-03 -2.269 0.023285 *
## PC105 2.431e-02 8.908e-03 2.730 0.006362 **
## PC106 2.357e-02 8.839e-03 2.667 0.007682 **
## PC107 8.179e-03 8.870e-03 0.922 0.356474
## PC108 -2.930e-03 8.958e-03 -0.327 0.743622
## PC109 6.896e-03 8.922e-03 0.773 0.439583
## PC110 -1.579e-03 8.846e-03 -0.179 0.858310
## PC111 -2.739e-02 8.944e-03 -3.063 0.002204 **
## PC112 -1.573e-02 8.951e-03 -1.758 0.078844 .
## PC113 1.574e-02 8.954e-03 1.758 0.078793 .
## PC114 -1.805e-02 8.892e-03 -2.030 0.042446 *
## PC115 -5.918e-02 8.963e-03 -6.602 4.47e-11 ***
## PC116 -1.338e-04 8.966e-03 -0.015 0.988098
## PC117 -1.962e-03 8.982e-03 -0.218 0.827128
## PC118 4.167e-03 8.951e-03 0.466 0.641535
## PC119 -1.356e-02 8.953e-03 -1.514 0.130019
## PC120 -2.350e-03 9.102e-03 -0.258 0.796278
## PC121 -1.623e-02 9.002e-03 -1.803 0.071393 .
## PC122 2.522e-02 9.077e-03 2.779 0.005480 **
## PC123 -1.596e-02 9.093e-03 -1.756 0.079211 .
## PC124 1.725e-04 9.117e-03 0.019 0.984903
## PC125 1.915e-02 9.149e-03 2.094 0.036333 *
## PC126 5.447e-03 9.085e-03 0.600 0.548803
## PC127 2.662e-03 9.144e-03 0.291 0.770994
## PC128 -2.259e-02 9.206e-03 -2.454 0.014152 *
## PC129 -3.956e-03 9.164e-03 -0.432 0.665992
## PC130 1.205e-02 9.060e-03 1.330 0.183675
## PC131 -2.445e-02 9.093e-03 -2.688 0.007201 **
## PC132 -5.131e-03 9.185e-03 -0.559 0.576427
## PC133 -7.879e-03 9.229e-03 -0.854 0.393305
## PC134 2.182e-02 9.159e-03 2.383 0.017222 *
## PC135 2.001e-03 9.208e-03 0.217 0.827976
## PC136 1.357e-02 9.229e-03 1.471 0.141411
## PC137 -1.480e-02 9.244e-03 -1.601 0.109357
## PC138 1.679e-02 9.240e-03 1.817 0.069201 .
## PC139 -1.946e-02 9.259e-03 -2.102 0.035607 *
## PC140 -2.736e-02 9.241e-03 -2.961 0.003081 **
## PC141 2.027e-03 9.283e-03 0.218 0.827115
## PC142 1.054e-02 9.274e-03 1.137 0.255702
## PC143 1.917e-02 9.316e-03 2.058 0.039653 *
## PC144 1.432e-02 9.395e-03 1.524 0.127542
## PC145 6.474e-03 9.334e-03 0.694 0.487945
## PC146 5.412e-02 9.294e-03 5.824 6.11e-09 ***
## PC147 -8.311e-03 9.426e-03 -0.882 0.378007
## PC148 -1.435e-03 9.258e-03 -0.155 0.876869
## PC149 1.680e-03 9.385e-03 0.179 0.857948
## PC150 1.412e-02 9.440e-03 1.496 0.134799
## PC151 2.682e-02 9.357e-03 2.867 0.004167 **
## PC152 -2.556e-02 9.418e-03 -2.714 0.006675 **
## PC153 2.453e-02 9.391e-03 2.612 0.009021 **
## PC154 -1.627e-02 9.398e-03 -1.732 0.083388 .
## PC155 1.949e-02 9.443e-03 2.063 0.039121 *
## PC156 1.748e-02 9.491e-03 1.842 0.065565 .
## PC157 -1.449e-03 9.429e-03 -0.154 0.877872
## PC158 2.530e-03 9.524e-03 0.266 0.790540
## PC159 4.142e-02 9.576e-03 4.326 1.55e-05 ***
## PC160 1.343e-02 9.510e-03 1.412 0.157982
## PC161 -6.157e-03 9.549e-03 -0.645 0.519097
## PC162 -4.410e-02 9.505e-03 -4.640 3.58e-06 ***
## PC163 1.259e-02 9.543e-03 1.319 0.187102
## PC164 8.200e-03 9.548e-03 0.859 0.390484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7285 on 5133 degrees of freedom
## Multiple R-squared: 0.3761, Adjusted R-squared: 0.3561
## F-statistic: 18.86 on 164 and 5133 DF, p-value: < 2.2e-16
# Re-run the diagnostics on the refit model (named args, matching the
# earlier plot.diagnostics() call style).
cd.full2 = plot.diagnostics(model = model.full2, train = data.train2)
## [1] "Number of data points that have Cook's D > 4/n: 200"
## [1] "Number of data points that have Cook's D > 1: 0"
# much more normal residuals than before.
# Checking to see if distributions are different and if so which variables
# High Leverage Plot: compare the target distribution of the flagged
# (high Cook's D) rows against the remaining rows.
plotData = data.train %>%
rownames_to_column() %>%
mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
dplyr::select(type,target=one_of(label.names))
ggplot(data=plotData, aes(x=type,y=target)) +
geom_boxplot(fill='light blue',outlier.shape=NA) +
scale_y_continuous(name="Target Variable Values",label=scales::comma_format(accuracy=.1)) +
theme_light() +
ggtitle('Distribution of High Leverage Points and Normal Points')
# 2 sample t-tests: per-feature equal-variance t-test of High vs Normal rows
# to find which features differ between the two groups.
plotData = data.train %>%
rownames_to_column() %>%
mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
dplyr::select(type,one_of(feature.names))
comp.test = lapply(dplyr::select(plotData, one_of(feature.names))
, function(x) t.test(x ~ plotData$type, var.equal = TRUE))
# Keep only features significant at the 0.05 level (rlist::list.filter).
sig.comp = list.filter(comp.test, p.value < 0.05)
sapply(sig.comp, function(x) x[['p.value']])
## PC1 PC2 PC3 PC4 PC6 PC7 PC11 PC23 PC24
## 1.545781e-15 2.422740e-03 3.278990e-04 2.406406e-02 4.866123e-02 2.404537e-04 3.754818e-06 7.122452e-06 1.515026e-08
## PC25 PC26 PC28 PC31 PC33 PC35 PC39 PC44 PC45
## 4.724172e-02 5.876494e-05 2.190500e-02 1.115662e-03 3.391806e-02 6.989040e-03 1.846662e-05 1.705590e-02 1.262332e-02
## PC46 PC66 PC69 PC85 PC129
## 4.255818e-03 1.886917e-02 3.711557e-02 1.022814e-02 4.710894e-02
# Box plots for the significant features only.
mm = melt(plotData, id=c('type')) %>% filter(variable %in% names(sig.comp))
ggplot(mm,aes(x=type, y=value)) +
geom_boxplot()+
facet_wrap(~variable, ncol=5, scales = 'free_y') +
scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
ggtitle('Distribution of High Leverage Points and Normal Points')
# Distribution (box) Plots for all features.
mm = melt(plotData, id=c('type'))
ggplot(mm,aes(x=type, y=value)) +
geom_boxplot()+
facet_wrap(~variable, ncol=8, scales = 'free_y') +
scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
ggtitle('Distribution of High Leverage Points and Normal Points')
# Intercept-only (grand mean) model as the baseline for comparison.
model.null = lm(grand.mean.formula, data.train)
summary(model.null)
##
## Call:
## lm(formula = grand.mean.formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1912 -0.6316 -0.0082 0.6402 3.7005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.006587 0.013335 -0.494 0.621
##
## Residual standard error: 0.9964 on 5583 degrees of freedom
References — basic regression in R: http://www.stat.columbia.edu/~martin/W2024/R10.pdf ; cross-validation and other model-selection metrics: http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/
# Train and evaluate each selection algorithm when its flag is enabled.
# CONSISTENCY FIX: the original used two back-to-back `if (flag == TRUE)`
# blocks per algorithm (one to train, one to test) with identical
# conditions; they are merged here. Conditions and call arguments unchanged.
# NOTE(review): `transformation = t` assumes `t` still holds the
# bestNormalize transformation fitted earlier -- verify it has not been
# reassigned between there and here.
if (algo.forward.caret == TRUE){
  set.seed(1)
  returned = train.caret.glmselect(formula = formula
                                   , data = data.train
                                   , method = "leapForward"
                                   , feature.names = feature.names)
  model.forward = returned$model
  id = returned$id
  test.model(model=model.forward, test=data.test
             ,method = 'leapForward',subopt = NULL
             ,formula = formula, feature.names = feature.names, label.names = label.names
             ,id = id
             ,draw.limits = TRUE, transformation = t)
}
if (algo.backward.caret == TRUE){
  set.seed(1)
  returned = train.caret.glmselect(formula = formula
                                   ,data = data.train
                                   ,method = "leapBackward"
                                   ,feature.names = feature.names)
  model.backward = returned$model
  id = returned$id
  test.model(model.backward, data.test
             ,method = 'leapBackward',subopt = NULL
             ,formula = formula, feature.names = feature.names, label.names = label.names
             ,id = id
             ,draw.limits = TRUE, transformation = t)
}
if (algo.stepwise.caret == TRUE){
  set.seed(1)
  returned = train.caret.glmselect(formula = formula
                                   ,data = data.train
                                   ,method = "leapSeq"
                                   ,feature.names = feature.names)
  model.stepwise = returned$model
  id = returned$id
  test.model(model.stepwise, data.test
             ,method = 'leapSeq',subopt = NULL
             ,formula = formula, feature.names = feature.names, label.names = label.names
             ,id = id
             ,draw.limits = TRUE, transformation = t)
}
if (algo.LASSO.caret == TRUE){
  set.seed(1)
  # Lambda grid for the L1 penalty (alpha = 1 selects the LASSO in glmnet).
  tune.grid= expand.grid(alpha = 1,lambda = 10^seq(from=-4,to=-2,length=100))
  returned = train.caret.glmselect(formula = formula
                                   ,data = data.train
                                   ,method = "glmnet"
                                   ,subopt = 'LASSO'
                                   ,tune.grid = tune.grid
                                   ,feature.names = feature.names)
  model.LASSO.caret = returned$model
  test.model(model.LASSO.caret, data.test
             ,method = 'glmnet',subopt = "LASSO"
             ,formula = formula, feature.names = feature.names, label.names = label.names
             ,draw.limits = TRUE, transformation = t)
}
if (algo.LARS.caret == TRUE){
  set.seed(1)
  # NOTE(review): subopt = 'NULL' (the string) looks like it should be NULL;
  # kept as-is since train.caret.glmselect may compare subopt with `==`,
  # where a real NULL would yield a zero-length condition. Confirm intent.
  returned = train.caret.glmselect(formula = formula
                                   ,data = data.train
                                   ,method = "lars"
                                   ,subopt = 'NULL'
                                   ,feature.names = feature.names)
  model.LARS.caret = returned$model
  test.model(model.LARS.caret, data.test
             ,method = 'lars',subopt = NULL
             ,formula = formula, feature.names = feature.names, label.names = label.names
             ,draw.limits = TRUE, transformation = t)
}
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 knitr_1.20 htmltools_0.3.6 reshape2_1.4.3
## [5] lars_1.2 doParallel_1.0.14 iterators_1.0.10 caret_6.0-81
## [9] leaps_3.0 ggforce_0.1.3 rlist_0.4.6.1 car_3.0-2
## [13] carData_3.0-2 bestNormalize_1.3.0 scales_1.0.0 onewaytests_2.0
## [17] caTools_1.17.1.1 mosaic_1.5.0 mosaicData_0.17.0 ggformula_0.9.1
## [21] ggstance_0.3.1 lattice_0.20-35 DT_0.5 ggiraph_0.6.0
## [25] investr_1.4.0 glmnet_2.0-16 foreach_1.4.4 Matrix_1.2-14
## [29] MASS_7.3-50 PerformanceAnalytics_1.5.2 xts_0.11-2 zoo_1.8-4
## [33] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8 purrr_0.2.5
## [37] readr_1.3.1 tidyr_0.8.2 tibble_1.4.2 ggplot2_3.1.0
## [41] tidyverse_1.2.1 usdm_1.1-18 raster_2.8-4 sp_1.3-1
## [45] pacman_0.5.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.2.0 backports_1.1.3 plyr_1.8.4 lazyeval_0.2.1 splines_3.5.1 mycor_0.1.1
## [7] crosstalk_1.0.0 leaflet_2.0.2 digest_0.6.18 magrittr_1.5 mosaicCore_0.6.0 openxlsx_4.1.0
## [13] recipes_0.1.4 modelr_0.1.2 gower_0.1.2 colorspace_1.3-2 rvest_0.3.2 ggrepel_0.8.0
## [19] haven_2.0.0 crayon_1.3.4 jsonlite_1.5 bindr_0.1.1 survival_2.42-3 glue_1.3.0
## [25] registry_0.5 gtable_0.2.0 ppcor_1.1 ipred_0.9-8 abind_1.4-5 rngtools_1.3.1
## [31] bibtex_0.4.2 Rcpp_1.0.0 xtable_1.8-3 units_0.6-2 foreign_0.8-70 stats4_3.5.1
## [37] lava_1.6.4 prodlim_2018.04.18 htmlwidgets_1.3 httr_1.4.0 RColorBrewer_1.1-2 pkgconfig_2.0.2
## [43] farver_1.1.0 nnet_7.3-12 labeling_0.3 tidyselect_0.2.5 rlang_0.3.1 later_0.7.5
## [49] munsell_0.5.0 cellranger_1.1.0 tools_3.5.1 cli_1.0.1 generics_0.0.2 moments_0.14
## [55] sjlabelled_1.0.17 broom_0.5.1 evaluate_0.12 ggdendro_0.1-20 yaml_2.2.0 ModelMetrics_1.2.2
## [61] zip_2.0.1 nlme_3.1-137 doRNG_1.7.1 mime_0.6 xml2_1.2.0 compiler_3.5.1
## [67] rstudioapi_0.8 curl_3.2 tweenr_1.0.1 stringi_1.2.4 highr_0.7 gdtools_0.1.7
## [73] pillar_1.3.1 data.table_1.11.8 bitops_1.0-6 insight_0.1.2 httpuv_1.4.5 R6_2.3.0
## [79] promises_1.0.1 gridExtra_2.3 rio_0.5.16 codetools_0.2-15 assertthat_0.2.0 pkgmaker_0.27
## [85] withr_2.1.2 nortest_1.0-4 mgcv_1.8-24 hms_0.4.2 quadprog_1.5-5 grid_3.5.1
## [91] rpart_4.1-13 timeDate_3043.102 class_7.3-14 rmarkdown_1.11 shiny_1.2.0 lubridate_1.7.4